;//==========================================================================
;//
;//  THIS CODE AND INFORMATION IS PROVIDED "AS IS" WITHOUT WARRANTY OF ANY
;//  KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
;//  IMPLIED WARRANTIES OF MERCHANTABILITY AND/OR FITNESS FOR A PARTICULAR
;//  PURPOSE.
;//
;//  Copyright (c) 1999 - 2001  On2 Technologies Inc. All Rights Reserved.
;//
;//--------------------------------------------------------------------------

#include "theora/theora.h"
#include "codec_internal.h"
#include "dsp.h"


static const  ogg_int64_t xC1S7 = 0x0fb15fb15fb15fb15LL;
static const  ogg_int64_t xC2S6 = 0x0ec83ec83ec83ec83LL;
static const  ogg_int64_t xC3S5 = 0x0d4dbd4dbd4dbd4dbLL;
static const  ogg_int64_t xC4S4 = 0x0b505b505b505b505LL;
static const  ogg_int64_t xC5S3 = 0x08e3a8e3a8e3a8e3aLL;
static const  ogg_int64_t xC6S2 = 0x061f861f861f861f8LL;
static const  ogg_int64_t xC7S1 = 0x031f131f131f131f1LL;


static __inline void Transpose_mmx( ogg_int16_t *InputData1, ogg_int16_t *OutputData1,
                                 ogg_int16_t *InputData2, ogg_int16_t *OutputData2)
{

    __asm {
        align 16
            mov     eax, InputData1
            mov     ebx, InputData2
            mov     ecx, OutputData1
            mov     edx, OutputData2


        movq		mm0, [eax]		;	/* mm0 = a0 a1 a2 a3 */
        movq		mm4, [ebx]		;	/* mm4 = e4 e5 e6 e7 */
        movq		mm1, [16 + eax]		;	/* mm1 = b0 b1 b2 b3 */
        movq		mm5, [16 + ebx]		;	/* mm5 = f4 f5 f6 f7 */
        movq		mm2, [32 + eax]		;	/* mm2 = c0 c1 c2 c3 */
        movq		mm6, [32 + ebx]		;	/* mm6 = g4 g5 g6 g7 */
        movq		mm3, [48 + eax]		;	/* mm3 = d0 d1 d2 d3 */
        movq		[16 + ecx], mm1		;	/* save  b0 b1 b2 b3 */
        movq		mm7, [48 + ebx]		;	/* mm7 = h0 h1 h2 h3 */
	        ;	/* Transpose 2x8 block */
        movq		mm1, mm4		;	/* mm1 = e3 e2 e1 e0 */
        punpcklwd		mm4, mm5		;	/* mm4 = f1 e1 f0 e0 */
        movq		[ecx], mm0		;	/* save a3 a2 a1 a0  */
        punpckhwd		mm1, mm5		;	/* mm1 = f3 e3 f2 e2 */
        movq		mm0, mm6		;	/* mm0 = g3 g2 g1 g0 */
        punpcklwd		mm6, mm7		;	/* mm6 = h1 g1 h0 g0 */
        movq		mm5, mm4		;	/* mm5 = f1 e1 f0 e0 */
        punpckldq		mm4, mm6		;	/* mm4 = h0 g0 f0 e0 = MM4 */
        punpckhdq		mm5, mm6		;	/* mm5 = h1 g1 f1 e1 = MM5 */
        movq		mm6, mm1		;	/* mm6 = f3 e3 f2 e2 */
        movq		[edx], mm4		;	
        punpckhwd		mm0, mm7		;	/* mm0 = h3 g3 h2 g2 */
        movq		[16 + edx], mm5		;	
        punpckhdq		mm6, mm0		;	/* mm6 = h3 g3 f3 e3 = MM7 */
        movq		mm4, [ecx]		;	/* mm4 = a3 a2 a1 a0 */
        punpckldq		mm1, mm0		;	/* mm1 = h2 g2 f2 e2 = MM6 */
        movq		mm5, [16 + ecx]		;	/* mm5 = b3 b2 b1 b0 */
        movq		mm0, mm4		;	/* mm0 = a3 a2 a1 a0 */
        movq		[48 + edx], mm6		;	
        punpcklwd		mm0, mm5		;	/* mm0 = b1 a1 b0 a0 */
        movq		[32 + edx], mm1		;	
        punpckhwd		mm4, mm5		;	/* mm4 = b3 a3 b2 a2 */
        movq		mm5, mm2		;	/* mm5 = c3 c2 c1 c0 */
        punpcklwd		mm2, mm3		;	/* mm2 = d1 c1 d0 c0 */
        movq		mm1, mm0		;	/* mm1 = b1 a1 b0 a0 */
        punpckldq		mm0, mm2		;	/* mm0 = d0 c0 b0 a0 = MM0 */
        punpckhdq		mm1, mm2		;	/* mm1 = d1 c1 b1 a1 = MM1 */
        movq		mm2, mm4		;	/* mm2 = b3 a3 b2 a2 */
        movq		[ecx], mm0		;	
        punpckhwd		mm5, mm3		;	/* mm5 = d3 c3 d2 c2 */
        movq		[16 + ecx], mm1		;	
        punpckhdq		mm4, mm5		;	/* mm4 = d3 c3 b3 a3 = MM3 */
        punpckldq		mm2, mm5		;	/* mm2 = d2 c2 b2 a2 = MM2 */
        movq		[48 + ecx], mm4		;	
        movq		[32 + ecx], mm2		;	

    };


}

static __inline void Fdct_mmx( ogg_int16_t *InputData1, ogg_int16_t *InputData2, ogg_int16_t *temp)
{

    __asm {
        align 16


                mov     eax, InputData1
                mov     ebx, InputData2
                mov     ecx, temp
        movq		mm0, [eax]		;	
        movq		mm1, [16 + eax]		;	
        movq		mm2, [48 + eax]		;	
        movq		mm3, [16 + ebx]		;	
        movq		mm4, mm0		;	
        movq		mm5, mm1		;	
        movq		mm6, mm2		;	
        movq		mm7, mm3		;	
				        ;	
        paddsw		mm0, [48 + ebx]		;	/* mm0 = ip0 + ip7 = is07 */
        paddsw		mm1, [32 + eax]		;	/* mm1 = ip1 + ip2 = is12 */
        paddsw		mm2, [ebx]		;	/* mm2 = ip3 + ip4 = is34 */
        paddsw		mm3, [32 + ebx]		;	/* mm3 = ip5 + ip6 = is56 */
        psubsw		mm4, [48 + ebx]		;	/* mm4 = ip0 - ip7 = id07 */
        psubsw		mm5, [32 + eax]		;	/* mm5 = ip1 - ip2 = id12 */
				        ;	
        psubsw		mm0, mm2		;	/* mm0 = is07 - is34 */
				        ;	
        paddsw		mm2, mm2		;	
				        ;	
        psubsw		mm6, [ebx]		;	/* mm6 = ip3 - ip4 = id34 */
				        ;	
        paddsw		mm2, mm0		;	/* mm2 = is07 + is34 = is0734 */
        psubsw		mm1, mm3		;	/* mm1 = is12 - is56 */
        movq		[ecx], mm0		;	/* Save is07 - is34 to free mm0; */
        paddsw		mm3, mm3		;	
        paddsw		mm3, mm1		;	/* mm3 = is12 + 1s56	= is1256 */
				        ;	
        psubsw		mm7, [32 + ebx]		;	/* mm7 = ip5 - ip6 = id56 */
	        ;	/* ------------------------------------------------------------------- */
        psubsw		mm5, mm7		;	/* mm5 = id12 - id56 */
        paddsw		mm7, mm7		;	
        paddsw		mm7, mm5		;	/* mm7 = id12 + id56 */
	        ;	/* ------------------------------------------------------------------- */
        psubsw		mm2, mm3		;	/* mm2 = is0734 - is1256 */
        paddsw		mm3, mm3		;	
				        ;	
        movq		mm0, mm2		;	/* make a copy */
        paddsw		mm3, mm2		;	/* mm3 = is0734 + is1256 */
				        ;	
        pmulhw		mm0, xC4S4		;	/* mm0 = xC4S4 * ( is0734 - is1256 ) - ( is0734 - is1256 ) */
        paddw		mm0, mm2		;	/* mm0 = xC4S4 * ( is0734 - is1256 ) */
        psrlw		mm2, 15		;	
        paddw		mm0, mm2		;	/* Truncate mm0, now it is op[4] */
				        ;	
        movq		mm2, mm3		;	
        movq		[ebx], mm0		;	/* save ip4, now mm0,mm2 are free */
				        ;	
        movq		mm0, mm3		;	
        pmulhw		mm3, xC4S4		;	/* mm3 = xC4S4 * ( is0734 +is1256 ) - ( is0734 +is1256 ) */
				        ;	
        psrlw		mm2, 15		;	
        paddw		mm3, mm0		;	/* mm3 = xC4S4 * ( is0734 +is1256 )	 */
        paddw		mm3, mm2		;	/* Truncate mm3, now it is op[0] */
				        ;	
        movq		[eax], mm3		;	
	        ;	/* ------------------------------------------------------------------- */
        movq		mm3, [ecx]		;	/* mm3 = irot_input_y */
        pmulhw		mm3, xC2S6	;	/* mm3 = xC2S6 * irot_input_y - irot_input_y */
				        ;	
        movq		mm2, [ecx]		;	
        movq		mm0, mm2		;	
				        ;	
        psrlw		mm2, 15		;	/* mm3 = xC2S6 * irot_input_y */
        paddw		mm3, mm0		;	
				        ;	
        paddw		mm3, mm2		;	/* Truncated */
        movq		mm0, mm5		;	
				        ;	
        movq		mm2, mm5		;	
        pmulhw		mm0, xC6S2		;	/* mm0 = xC6S2 * irot_input_x */
				        ;	
        psrlw		mm2, 15		;	
        paddw		mm0, mm2		;	/* Truncated */
				        ;	
        paddsw		mm3, mm0		;	/* ip[2] */
        movq		[32 + eax], mm3		;	/* Save ip2 */
				        ;	
        movq		mm0, mm5		;	
        movq		mm2, mm5		;	
				        ;	
        pmulhw		mm5, xC2S6		;	/* mm5 = xC2S6 * irot_input_x - irot_input_x */
        psrlw		mm2, 15		;	
				        ;	
        movq		mm3, [ecx]		;	
        paddw		mm5, mm0		;	/* mm5 = xC2S6 * irot_input_x */
				        ;	
        paddw		mm5, mm2		;	/* Truncated */
        movq		mm2, mm3		;	
				        ;	
        pmulhw		mm3, xC6S2		;	/* mm3 = xC6S2 * irot_input_y */
        psrlw		mm2, 15		;	
				        ;	
        paddw		mm3, mm2		;	/* Truncated */
        psubsw		mm3, mm5		;	
				        ;	
        movq		[32 + ebx], mm3		;	
	        ;	/* ------------------------------------------------------------------- */
        movq		mm0, xC4S4		;	
        movq		mm2, mm1		;	
        movq		mm3, mm1		;	
				        ;	
        pmulhw		mm1, mm0		;	/* mm0 = xC4S4 * ( is12 - is56 ) - ( is12 - is56 ) */
        psrlw		mm2, 15		;	
				        ;	
        paddw		mm1, mm3		;	/* mm0 = xC4S4 * ( is12 - is56 ) */
        paddw		mm1, mm2		;	/* Truncate mm1, now it is icommon_product1 */
				        ;	
        movq		mm2, mm7		;	
        movq		mm3, mm7		;	
				        ;	
        pmulhw		mm7, mm0		;	/* mm7 = xC4S4 * ( id12 + id56 ) - ( id12 + id56 ) */
        psrlw		mm2, 15		;	
				        ;	
        paddw		mm7, mm3		;	/* mm7 = xC4S4 * ( id12 + id56 ) */
        paddw		mm7, mm2		;	/* Truncate mm7, now it is icommon_product2 */
	        ;	/* ------------------------------------------------------------------- */
        pxor		mm0, mm0		;	/* Clear mm0 */
        psubsw		mm0, mm6		;	/* mm0 = - id34 */
				        ;	
        psubsw		mm0, mm7		;	/* mm0 = - ( id34 + idcommon_product2 ) */
        paddsw		mm6, mm6		;	
        paddsw		mm6, mm0		;	/* mm6 = id34 - icommon_product2 */
				        ;	
        psubsw		mm4, mm1		;	/* mm4 = id07 - icommon_product1 */
        paddsw		mm1, mm1		;	
        paddsw		mm1, mm4		;	/* mm1 = id07 + icommon_product1 */
	        ;	/* ------------------------------------------------------------------- */
        movq		mm7, xC1S7		;	
        movq		mm2, mm1		;	
				        ;	
        movq		mm3, mm1		;	
        pmulhw		mm1, mm7		;	/* mm1 = xC1S7 * irot_input_x - irot_input_x */
				        ;	
        movq		mm7, xC7S1		;	
        psrlw		mm2, 15		;	
				        ;	
        paddw		mm1, mm3		;	/* mm1 = xC1S7 * irot_input_x */
        paddw		mm1, mm2		;	/* Trucated */
				        ;	
        pmulhw		mm3, mm7		;	/* mm3 = xC7S1 * irot_input_x */
        paddw		mm3, mm2		;	/* Truncated */
				        ;	
        movq		mm5, mm0		;	
        movq		mm2, mm0		;	
				        ;	
        movq		mm7, xC1S7		;	
        pmulhw		mm0, mm7		;	/* mm0 = xC1S7 * irot_input_y - irot_input_y */
				        ;	
        movq		mm7, xC7S1		;	
        psrlw		mm2, 15		;	
				        ;	
        paddw		mm0, mm5		;	/* mm0 = xC1S7 * irot_input_y */
        paddw		mm0, mm2		;	/* Truncated */
				        ;	
        pmulhw		mm5, mm7		;	/* mm5 = xC7S1 * irot_input_y */
        paddw		mm5, mm2		;	/* Truncated */
				        ;	
        psubsw		mm1, mm5		;	/* mm1 = xC1S7 * irot_input_x - xC7S1 * irot_input_y = ip1 */
        paddsw		mm3, mm0		;	/* mm3 = xC7S1 * irot_input_x - xC1S7 * irot_input_y = ip7 */
				        ;	
        movq		[16 + eax], mm1		;	
        movq		[48 + ebx], mm3		;	
	        ;	/* ------------------------------------------------------------------- */
        movq		mm0, xC3S5		;	
        movq		mm1, xC5S3		;	
				        ;	
        movq		mm5, mm6		;	
        movq		mm7, mm6		;	
				        ;	
        movq		mm2, mm4		;	
        movq		mm3, mm4		;	
				        ;	
        pmulhw		mm4, mm0		;	/* mm4 = xC3S5 * irot_input_x - irot_input_x */
        pmulhw		mm6, mm1		;	/* mm6 = xC5S3 * irot_input_y - irot_input_y */
				        ;	
        psrlw		mm2, 15		;	
        psrlw		mm5, 15		;	
				        ;	
        paddw		mm4, mm3		;	/* mm4 = xC3S5 * irot_input_x */
        paddw		mm6, mm7		;	/* mm6 = xC5S3 * irot_input_y */
				        ;	
        paddw		mm4, mm2		;	/* Truncated */
        paddw		mm6, mm5		;	/* Truncated */
				        ;	
        psubsw		mm4, mm6		;	/* ip3 */
        movq		[48 + eax], mm4		;	
				        ;	
        movq		mm4, mm3		;	
        movq		mm6, mm7		;	
				        ;	
        pmulhw		mm3, mm1		;	/* mm3 = xC5S3 * irot_input_x - irot_input_x */
        pmulhw		mm7, mm0		;	/* mm7 = xC3S5 * irot_input_y - irot_input_y */
				        ;	
        paddw		mm4, mm2		;	
        paddw		mm6, mm5		;	
				        ;	
        paddw		mm3, mm4		;	/* mm3 = xC5S3 * irot_input_x */
        paddw		mm7, mm6		;	/* mm7 = xC3S5 * irot_input_y */
				        ;	
        paddw		mm3, mm7		;	/* ip5 */
        movq		[16 + ebx], mm3		;	

};

}


static void fdct_short__mmx ( ogg_int16_t *InputData, ogg_int16_t *OutputData)
{

  static ogg_int16_t tmp[32];
  ogg_int16_t* align_tmp = (ogg_int16_t*)((unsigned char*)tmp + (16 - ((int)tmp)&15));


  Transpose_mmx(InputData, OutputData, InputData + 4, OutputData + 4);
  Fdct_mmx(OutputData, OutputData + 4, align_tmp);

  Transpose_mmx(InputData + 32, OutputData + 32, InputData + 36, OutputData + 36);
  Fdct_mmx(OutputData+32, OutputData + 36, align_tmp);

  Transpose_mmx(OutputData, OutputData, OutputData + 32, OutputData + 32);
  Fdct_mmx(OutputData, OutputData + 32, align_tmp);

  Transpose_mmx(OutputData + 4, OutputData + 4, OutputData + 36, OutputData + 36);
  Fdct_mmx(OutputData + 4, OutputData + 36, align_tmp);

  __asm     emms

}

void dsp_mmx_fdct_init(DspFunctions *funcs)
{
  TH_DEBUG("enabling accelerated x86_32 mmx fdct function.\n");
  funcs->fdct_short = fdct_short__mmx;
}
